In [19]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
%matplotlib inline
The log format I use for nginx:
'$remote_addr - $remote_user [$time_local] '
'"$request" $status $body_bytes_sent '
'"$http_referer" "$http_user_agent" '
'$request_time $upstream_response_time $pipe';
The following regex will match each field in this log format:
In [20]:
pattern = re.compile(r"""(?P<remote_addr>.+?) \s-\s
                         (?P<remote_user>.+?) \s
                         \[(?P<time_local>.+?)\] \s
                         "(?P<method>.+?) \s (?P<uri>.+?) \s (?P<protocol>.+?)" \s
                         (?P<status>.+?) \s
                         (?P<body_bytes_sent>.+?) \s
                         "(?P<http_referer>.+?)"? \s  # Optional final " as I initially missed this when setting the format
                         "(?P<http_user_agent>.*?)" \s
                         (?P<request_time>.+?) \s
                         (?P<upstream_response_time>.+?) \s
                         (?P<pipe>.+?)""", re.VERBOSE)
In [21]:
test = """195.211.155.181 - - [18/Jan/2015:22:43:05 +0000] "GET /blog/email-templates-using-zend-framework/ HTTP/1.1" """ + \
"""200 7408 "-" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Mozilla/4.0 (compatible; MSIE 6.0; """ + \
"""Windows NT 5.1; SV1) )" 0.039 0.039 ."""
m = re.match(pattern, test)
m.groupdict()
Out[21]:
With our working regex pattern we can parse every line in the log file and store the data in a list of dicts.
In [22]:
log_file = '/home/jonathan/dev/access.log'
log_list = []
with open(log_file) as f:
    for line in f:
        m = re.match(pattern, line)
        log_list.append(m.groupdict())
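If any line fails to match, re.match returns None and m.groupdict() raises an AttributeError. A more defensive version of the loop (a sketch, not part of the original notebook) skips and counts the lines that don't parse:

log_list = []
skipped = 0
with open(log_file) as f:
    for line in f:
        m = re.match(pattern, line)
        if m is None:
            skipped += 1  # count unparseable lines instead of crashing
        else:
            log_list.append(m.groupdict())
print('skipped {} unparseable lines'.format(skipped))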
Converting the data to a pandas DataFrame only requires passing the list to pd.DataFrame.
In [23]:
log = pd.DataFrame(log_list)
print(log.head(1))
In [24]:
log.info()
Currently, all columns are of object type. The data will be easier to work with if the types are converted where appropriate.
status is a numeric code, so it could be converted to an integer, but since it is really a categorical variable we probably don't need to convert it.
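If we did want status to carry its categorical meaning explicitly, a one-line sketch would be:

log['status'] = log['status'].astype('category')

pandas stores each distinct code once and refers to it by an integer internally, which also saves memory on a large log.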
In [25]:
# Strip the timezone offset before parsing the timestamp
log['time_local'] = pd.to_datetime(log['time_local'].apply(lambda x: x.split(' ')[0]), format='%d/%b/%Y:%H:%M:%S')
# nginx logs '-' when there was no upstream response; treat it as missing
log.loc[log['upstream_response_time'] == '-', 'upstream_response_time'] = np.nan
log[['upstream_response_time', 'request_time']] = log[['upstream_response_time', 'request_time']].astype(np.float64)
log['body_bytes_sent'] = log['body_bytes_sent'].astype(np.int64)
log.info()
The index is currently the default, an incrementing number (in this case 0 to 335084).
Setting the index to the time of the request will simplify analysis later.
In [26]:
log.index = log.pop('time_local')
log.info()
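With a DatetimeIndex in place, time-based selection becomes very convenient. For example (a sketch; the dates are taken from the sample line above):

log.loc['2015-01-18']                            # all requests on one day
log.loc['2015-01-18 22:00':'2015-01-18 23:00']   # a one-hour window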
Basic summary information is available with the describe method.
In [27]:
log.describe()
Out[27]:
Some of the requests take a long time. Let's take a closer look.
In [28]:
mask = log['upstream_response_time'] > 1.0
log[mask].shape
Out[28]:
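Since mask is a boolean Series, its mean puts that count in context as a proportion of all requests:

mask.mean()  # fraction of requests with upstream_response_time > 1.0s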
In [29]:
selected_columns = ['request_time', 'status', 'upstream_response_time', 'uri']
log.loc[mask, selected_columns].sort_values('upstream_response_time', ascending=False)
Out[29]:
The slowest requests are on the back end, but a couple of blog posts and the atom feed also appear to be slow. Are they always slow, or were these requests anomalous?
In [30]:
mask = log['uri'] == '/blog/atom/'
log[mask].describe()
Out[30]:
In [31]:
np.percentile(log.loc[log['uri'] == '/blog/atom/', 'upstream_response_time'], 95)
Out[31]:
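For comparison, the same figure can be computed with pandas' quantile method, which skips NaN values by default (np.percentile would return nan if any were present, so the call above relies on this URI having no missing upstream_response_time). A sketch:

log.loc[log['uri'] == '/blog/atom/', 'upstream_response_time'].quantile(0.95)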
Resampling on the new DatetimeIndex gives the number of requests per day:
In [32]:
log_freq = log['body_bytes_sent'].resample('D').count()
In [33]:
log_freq.plot()
Out[33]:
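Daily counts are noisy; a seven-day rolling mean (a sketch using the current rolling API) makes the trend easier to see:

log_freq.rolling(7).mean().plot()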
Which pages get the most requests?
In [34]:
top_uri = log.groupby('uri').size()
top_uri = top_uri.sort_values()
top_uri.tail(10)
Out[34]:
Let's take a closer look at one of the most requested URIs.
In [35]:
uri_mask = log['uri'] == '/blog/bcne3-search-phpbb-with-sphinx/'
selected_columns = ['remote_addr', 'method', 'http_user_agent', 'http_referer']
log.loc[uri_mask, selected_columns].describe()
Out[35]:
Restricting to the POST requests for the same URI:
In [36]:
uri_mask = log['uri'] == '/blog/bcne3-search-phpbb-with-sphinx/'
method_mask = log['method'] == 'POST'
mask = uri_mask & method_mask
selected_columns = ['remote_addr', 'method', 'http_user_agent', 'http_referer']
log.loc[mask, selected_columns].describe()
Out[36]: